None
__DATS 6013(O10)__ - __Zhiyuan Tao__ - __G45055489__
motivations:
TED Conferences, LLC was born with an initial focus on the fields of "Technology, Entertainment, and Design", and it has now extended to almost every field. English teachers often ask us to learn English by watching TED talks, professors recommend them as a way to gain new insights, and I may also find that my favorite scholars have delivered TED talks.
\
The most valuable thing is that successful speakers condense half a lifetime of research and experience and share it with us in just a few minutes.
\
Since there are already 4,000+ talks, the data is big enough for us to analyze and to mine for useful information.
The main objectives of this project are:
# pip install yellowbrick
# pip install wordcloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for data preprocessing
import ast
# for text analysis
from sklearn.feature_extraction import text
import os
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import re
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# import datetime # seems not necessary
TED - Ultimate Dataset
https://www.kaggle.com/miguelcorraljr/ted-ultimate-dataset
TED scraper
https://github.com/corralm/TEDscraper
all the 4,000+ talks, from the first online publication date, 2006-06-27, to 2020-04-27
# Load the dataset; keep a transcript-free copy (`ted`) for the metadata analysis.
df = pd.read_csv("ted_talks_en.csv")
ted = df.drop(['transcript'], axis=1)
ted.columns
ted.head(1)
#pd.options.display.max_rows = 100
# Row at position 3831 has a wrong recorded_date; inspect, patch, re-check.
ted['recorded_date'].iloc[3831]
# Use .loc with the real index label: the original chained assignment
# (ted['recorded_date'].iloc[3831] = ...) can silently write to a temporary
# copy of the column, and the SettingWithCopyWarning is suppressed above.
ted.loc[ted.index[3831], 'recorded_date'] = '2019-08-01'
ted['recorded_date'].iloc[3831]
# duration from second to mins (division already yields float; just round)
ted['duration_min'] = (ted['duration'] / 60).round(decimals=2)
ted = ted.drop(['duration'], axis=1)
ted.head(1)
#pd.options.display.max_rows = 100
#pd.set_option('max_colwidth',50)
ted.describe()
ted[['views', 'comments','duration_min']].corr()
def get_top10(feature):
    """Return the 10 talks from the module-level ``ted`` with the largest
    values of *feature*, keeping every column.

    NOTE: this definition is shadowed by a later ``get_top10`` that trims
    the column list for display.
    """
    ranked = ted.sort_values(by=feature, ascending=False)
    return ranked.set_index(feature).reset_index().head(10)
# discussion rate: comments per view — a proxy for how much debate a talk sparks
ted['dis_rate'] = ted['comments']/ted['views']
def get_top10(feature):
    """Return the 10 talks with the largest *feature*, showing key columns only.

    Shadows the earlier full-column ``get_top10``; reads the module-level
    ``ted`` DataFrame.
    """
    display_cols = ['title', 'speaker_1', 'views', 'comments', 'duration_min',
                    'recorded_date', 'topics', 'dis_rate']
    ranked = ted[display_cols].sort_values(by=feature, ascending=False)
    return ranked.set_index(feature).reset_index().head(10)
pd.set_option('max_colwidth',50)
#get_top10('views')
#get_top10('comments')
#get_top10('dis_rate')
# Horizontal bar chart of the 10 most-viewed talks (sorted so the largest
# bar appears on top); hover shows speaker, comments, date and topics.
fig = px.bar(get_top10('views').sort_values('views'), x='views', y='title',# orientation='h',
hover_data=["speaker_1", "comments","recorded_date",'topics'],
title='top 10 views')
fig.update_layout(xaxis_tickfont_size=14)
fig.update_layout(yaxis_tickfont_size=16)
#fig.update_layout(uniformtext_minsize=30)#, uniformtext_mode='hide')
fig.show()
# Same chart for the 10 most-commented talks.
fig = px.bar(get_top10('comments').sort_values('comments'), x='comments', y='title',# orientation='h',
hover_data=["speaker_1", "comments","recorded_date",'topics'],
#height=300,
title='top 10 comments')
fig.update_layout(xaxis_tickfont_size=14)
fig.update_layout(yaxis_tickfont_size=16)
fig.show()
# Same chart for the 10 talks with the highest comments-per-view ratio.
fig = px.bar(get_top10('dis_rate').sort_values('dis_rate'), x='dis_rate', y='title',# orientation='h',
hover_data=["speaker_1", "comments","recorded_date",'topics'],
#height=300,
title='top 10 discussion rate')
fig.update_layout(xaxis_tickfont_size=14)
fig.update_layout(yaxis_tickfont_size=16)
fig.show()
# Month/day lookup tables used by the time-based aggregations below.
months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Extract the recording year (as a string) from the 'YYYY-MM-DD' date.
ted['year'] = ted['recorded_date'].apply(lambda x: x.split('-')[0])
# Count talks per year, then sort chronologically for the line chart.
year_df = pd.DataFrame(ted['year'].value_counts().reset_index())
year_df.columns = ['year', 'talks']
year_df = year_df.sort_values(by='year', ascending=True)
# drop '2020' since it's not complete
fig = go.Figure(go.Scatter(x=year_df['year'][:-1], y=year_df['talks'][:-1], mode='lines+markers'))
fig.show()
# Heatmap of talk counts per (month, year) cell.
hmap_df = ted.copy()
# Re-label each talk's date as "Mon YYYY" so talks can be counted per cell.
hmap_df['recorded_date'] = hmap_df['recorded_date'].apply(lambda x: month_order[int(x.split('-')[1]) - 1] + " " + str(x.split('-')[0]))
hmap_df = pd.pivot_table(hmap_df[['recorded_date', 'title']], index='recorded_date', aggfunc='count').reset_index()
hmap_df['month_num'] = hmap_df['recorded_date'].apply(lambda x: months[x.split()[0]])
hmap_df['year'] = hmap_df['recorded_date'].apply(lambda x: x.split()[1])
hmap_df = hmap_df.sort_values(['year', 'month_num'])
hmap_df = hmap_df[['month_num', 'year', 'title']]
# Use keyword arguments: positional DataFrame.pivot arguments were
# deprecated in pandas 1.1 and removed in pandas 2.0.
hmap_df = hmap_df.pivot(index='month_num', columns='year', values='title')
# Months with no talks produced NaN counts; show them as 0.
hmap_df = hmap_df.fillna(0)
f, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(hmap_df, annot=True, linewidths=.5, ax=ax, fmt='n', yticklabels=month_order)
plt.show()
# transfer 'topics' from str to list (each cell is a stringified Python list)
ted['topics'] = ted['topics'].apply(lambda x: ast.literal_eval(x))
ted['topics']
# unfold all the topics' tag: one row per (talk, topic) pair, so a talk
# with N topics appears N times in topics_df
topics = ted.apply(lambda x: pd.Series(x['topics']),axis=1).stack().reset_index(level=1, drop=True)
topics.name = 'topics'
topics
topics_df = ted.drop('topics', axis=1).join(topics)
topics_df
len(topics_df['topics'].value_counts())
# Rank topics by number of talks carrying the tag.
pop_topics = pd.DataFrame(topics_df['topics'].value_counts()).reset_index()
pop_topics.columns = ['topics', 'talks']
# Pie of the 15 most frequent topics (absolute counts inside the slices).
fig = px.pie(pop_topics.head(15), values='talks', names='topics', title='Top Topics',
width=1000, height=800)
fig.update_traces(textposition='inside', textinfo='value+label')
fig.show()
# Pie of every topic (percent shares).
fig = px.pie(pop_topics, values='talks', names='topics', title='All Topics',
width=1000, height=800)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Keep only talks tagged with a top-15 topic, excluding the 'TEDx' meta-tag.
# .copy() avoids chained-assignment ambiguity on the filtered frame — the
# SettingWithCopyWarning this would otherwise raise is globally suppressed.
topics_year = topics_df[(topics_df['topics'].isin(pop_topics.head(15)['topics'])) & (topics_df['topics'] != 'TEDx')].copy()
topics_year['year'] = topics_year['year'].astype('int')
topics_year = topics_year[topics_year['year'] > 2002]
themes = list(pop_topics.head(15)['topics'])
themes.remove('TEDx')
# Per-year topic shares (each row sums to 1), drawn as a stacked bar chart
# with the legend moved outside the plot area.
ctab = pd.crosstab([topics_year['year']], topics_year['topics']).apply(lambda x: x/x.sum(), axis=1)
ctab[themes].plot(kind='bar', stacked=True, colormap='tab20', figsize=(12,8)).legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
# The text analysis needs transcripts, so work on a copy of the full df.
content = df.copy()
content.head(1)
def find_topic(topic):
    """Return a 0/1 indicator list marking talks whose topics include *topic*.

    Scans the module-level ``df``: each cell of df['topics'] is a
    stringified Python list, parsed with ast.literal_eval before the
    (case-insensitive on *topic*'s side) membership test.
    """
    target = topic.lower()
    return [1 if target in ast.literal_eval(t_list) else 0
            for t_list in df['topics']]
# add columns for selected topics (1 if the talk carries the tag, else 0)
content['is_science'] = find_topic('science')
content['is_business'] = find_topic('business')
content['is_psychology'] = find_topic('psychology')
content.head(1)
# keep only talks tagged with at least one of the three selected topics
content = content.loc[(content['is_science']==1) | (content['is_business']==1) |
(content['is_psychology']==1), : ].reset_index(drop=True)
# create new DataFrames for each topic
science = content.loc[(content['is_science']==1), 'talk_id':'transcript'].reset_index(drop=True)
business = content.loc[(content['is_business']==1), 'talk_id':'transcript'].reset_index(drop=True)
psychology = content.loc[(content['is_psychology']==1), 'talk_id':'transcript'].reset_index(drop=True)
print('science', science.shape)
print('business', business.shape)
print('psychology', psychology.shape)
content
def combine_transcripts(transcript_list):
    """Concatenate a list of transcript strings into one space-separated corpus."""
    return ' '.join(transcript_list)
def transcripts_to_dict(df, topic_list):
    """Return a dict mapping each topic to one combined transcript string.

    For every topic in *topic_list*, the rows of *df* whose ``is_<topic>``
    indicator column equals 1 have their ``transcript`` values joined with
    single spaces (the helper indirection and the redundant ``str(topic)``
    of the original version are removed; behavior is unchanged).

    :param df: DataFrame with 0/1 ``is_<topic>`` columns and a 'transcript' column
    :param topic_list: list of topic names (strings)
    :return: dict {topic: combined transcript text}
    """
    return {
        topic: ' '.join(df.loc[df['is_' + topic] == 1, 'transcript'])
        for topic in topic_list
    }
# create dictionary from the DataFrame: {topic: combined transcript text}
transcript_dict = transcripts_to_dict(content, ['science', 'business', 'psychology'])
# construct DataFrame from dictionary (one row per topic)
transcripts3 = pd.DataFrame.from_dict(transcript_dict, orient='index')
transcripts3.rename({0: 'transcript'}, axis=1, inplace=True)
# widen the column display temporarily to eyeball the combined text
pd.set_option('max_colwidth',500)
transcripts3
pd.set_option('max_colwidth',50)
# one corpus string per topic, consumed by the word clouds below
corpus1 = ' '.join(transcripts3.loc['science'])
corpus2 = ' '.join(transcripts3.loc['business'])
corpus3 = ' '.join(transcripts3.loc['psychology'])
#corpus3
# add custom stop words: frequent filler words in spoken talks (plus the
# '(Laughter)' transcript marker) that would otherwise drown out topic terms
custom_stop_words = {
'know','think','people','thing','things','something','want','world','come',
'one','see','now','going','way','u','much','laughter','say','said','us','really','actually',
'make','will','well'
}
STOPWORDS_new = STOPWORDS | custom_stop_words
def _show_wordcloud(corpus):
    """Render one word cloud (white background, custom stop words) for *corpus*."""
    wordcloud = WordCloud(stopwords=STOPWORDS_new, background_color='white',
                          width=2400, height=2000).generate(corpus)
    plt.figure(figsize=(12, 15))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

# The original repeated the same seven lines for each topic; loop instead.
# Order preserved: science, business, psychology.
for _corpus in (corpus1, corpus2, corpus3):
    _show_wordcloud(_corpus)
# Recommender setup: TF-IDF over talk descriptions + cosine similarity.
# NOTE: this `rec` DataFrame is shadowed by the `rec()` function defined
# further down; it is only read before that definition runs.
rec = ted[['title','description']]
rec
# (An earlier, disabled HTML-tag-stripping step was removed; the raw
# descriptions are used as-is.)
# Transforms text to feature vectors
Text = rec['description'].tolist()
# Fix: TfidfVectorizer's `input` parameter only accepts 'filename', 'file'
# or 'content' — passing the document list there was wrong (recent
# scikit-learn versions reject it). The documents belong in fit_transform.
tfidf = text.TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(Text)
print(matrix.shape)
# Get Similarity Scores using cosine similarity (dense n_talks x n_talks)
cosine_sim = cosine_similarity(matrix)
print(cosine_sim)
# Map row positions to talk titles for the lookups in recommend_talks.
indices = pd.Series(rec['title'])
def recommend_talks(name):
    """Print the titles of the 10 talks most similar to *name*.

    Looks *name* up in the module-level ``indices`` Series, ranks every
    talk by cosine similarity of TF-IDF description vectors, and prints
    the top 10 (one per line). Raises IndexError if the title is unknown.
    """
    idx = indices[indices == name].index[0]
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending=False)
    # skip position 0: a talk is always most similar to itself
    similar = [indices[i] for i in scores.iloc[1:11].index]
    print(*similar, sep='\n')
def rec():
    """Interactive loop: prompt for a talk title and print 10 similar talks.

    Type "quit" to stop. An unknown title (IndexError from
    recommend_talks) or Ctrl-C prints an error and re-prompts.
    NOTE: this function name shadows the ``rec`` DataFrame defined earlier;
    that frame is only used before this definition, so nothing breaks.
    """
    while True:
        try:
            name = input("\n Enter The title of the TED Talk : ")
            if name.lower() == 'quit':
                break
            # recommend_talks prints the titles and returns None, so this
            # also prints a trailing "None" (kept from the original).
            print("\n", recommend_talks(name))
        except (KeyboardInterrupt, IndexError):
            # Loop back instead of recursing: the original called rec()
            # inside the except block, growing the call stack (and nesting
            # loops) on every bad input.
            print("The TED Talk does not exist\n")
print("To exit Enter \"quit\" \n")
rec()